#loading packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(lubridate)
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(patchwork)
library(readxl)

# data

# Load the raw income-inequality table. The dotted names on the right-hand
# side of rename() below are the column names as they come out of read.csv()
# (presumably the CSV headers with spaces/symbols turned into dots).
global_income = read.csv("global_income_inequality.csv")

# Give every column a snake_case name. The earlier
# group_by(country, year) |> ungroup() pair was a no-op (grouping followed
# immediately by ungrouping changes nothing), so it has been dropped.
global_income = global_income |>
  rename(
    country = Country,
    year = Year,
    population = Population,
    gini_index = Gini.Index,
    average_income_USD = Average.Income..USD.,
    top10_income_share = Top.10..Income.Share....,
    bottom10_income_share = Bottom.10..Income.Share....,
    income_group = Income.Group
  )

# Country lookup table: read the workbook, lower-case the two column names
# used later, and shorten three long official country names (presumably so
# they match the spellings used in global_income -- verify against the data).
Country_data = read_excel("Country_data.xlsx") |>
  rename(country = Country, region = Region) |>
  mutate(
    country = recode(
      country,
      "United Kingdom of Great Britain and Northern Ireland" = "United Kingdom",
      "Russian Federation" = "Russia",
      "United States of America" = "United States"
    )
  )

# Attach the country-level columns (e.g. region). inner_join() keeps only
# the countries that appear in both tables.
global_income = inner_join(global_income, Country_data, by = "country")

#sum missing

# Count the missing (NA) entries in `col`.
# Returns a single integer: the number of positions where is.na() is TRUE.
sum_miss = function(col) {
  length(which(is.na(col)))
}

# Per-column NA counts for the joined table, printed as a named list.
global_income |> lapply(sum_miss)
## $country
## [1] 0
## 
## $year
## [1] 0
## 
## $population
## [1] 0
## 
## $gini_index
## [1] 0
## 
## $average_income_USD
## [1] 0
## 
## $top10_income_share
## [1] 0
## 
## $bottom10_income_share
## [1] 0
## 
## $income_group
## [1] 0
## 
## $`Country code`
## [1] 0
## 
## $region
## [1] 0
# Interactive scatter of Gini index against average income: points coloured
# by region and sized by population. The `frame = year` aesthetic is passed
# through for ggplotly(), which uses it to build one animation frame per year.
p1 = ggplot(
  global_income,
  aes(x = gini_index, y = average_income_USD, color = region, frame = year)
) +
  geom_point(aes(size = population))

ggplotly(p1)
# Interactive scatter of Gini index against the top-10% income share:
# points coloured by region and sized by population, with `year` supplied
# as the frame aesthetic for ggplotly().
p2 = ggplot(
  global_income,
  aes(x = gini_index, y = top10_income_share, color = region, frame = year)
) +
  geom_point(aes(size = population))

ggplotly(p2)
# Interactive scatter of Gini index against the bottom-10% income share:
# points coloured by region and sized by population, with `year` supplied
# as the frame aesthetic for ggplotly().
# This plot was previously an exact copy of p2 (y = top10_income_share);
# it now shows the bottom-10% share, which no other plot covered.
p3 = global_income |>
  ggplot(aes(x = gini_index, y = bottom10_income_share, color = region, frame = year)) +
  geom_point(aes(size = population))

ggplotly(p3)
# Countries on the three 2023 rows with the highest Gini index.
top3_unequal = global_income |>
  filter(year == 2023) |>
  arrange(desc(gini_index)) |>
  slice_head(n = 3) |>
  pull(country)

# Countries on the three 2023 rows with the lowest Gini index.
bottom3_unequal = global_income |>
  filter(year == 2023) |>
  arrange(gini_index) |>
  slice_head(n = 3) |>
  pull(country)

# Line chart of the Gini index over time for the three most unequal
# countries (as ranked in 2023). The title previously said "Average income"
# although the y-axis is gini_index; it now names the plotted measure.
global_income |>
  filter(country %in% top3_unequal) |>
  hchart(type = "line", hcaes(x = year, y = gini_index, group = country)) |>
  hc_title(text = "Gini index over time by country (Most disparities)")
# Line chart of the Gini index over time for the three least unequal
# countries (as ranked in 2023). The title previously said "Average income"
# although the y-axis is gini_index; it now names the plotted measure.
global_income |>
  filter(country %in% bottom3_unequal) |>
  hchart(type = "line", hcaes(x = year, y = gini_index, group = country)) |>
  hc_title(text = "Gini index over time by country (Least disparities)")
## Potential visuals

# Horizontal boxplots of the Gini index distribution, one box per year.
# The group_by(year) that used to precede ggplot() was redundant: the boxes
# are already split by the discrete x aesthetic as.factor(year).

# Years after 2019.
c1 = global_income |>
  filter(year > 2019) |>
  ggplot(aes(x = as.factor(year), y = gini_index)) +
  geom_boxplot() +
  coord_flip()

# Years 2015 through 2019 (inclusive).
c2 = global_income |>
  filter(year <= 2019, year >= 2015) |>
  ggplot(aes(x = as.factor(year), y = gini_index)) +
  geom_boxplot() +
  coord_flip()

# patchwork: stack c1 on top of c2.
c1 / c2

# Yearly mean and median Gini index across all rows, reshaped to long form
# (one row per year and statistic) and drawn as two lines.
global_income |>
  group_by(year) |>
  summarise(
    mean_gini = mean(gini_index),
    median_gini = median(gini_index)
  ) |>
  pivot_longer(cols = -year, names_to = "type", values_to = "value") |>
  hchart(type = "line", hcaes(x = year, y = value, group = type))
# Yearly mean and median of average income across all rows, reshaped to
# long form (one row per year and statistic) and drawn as two lines.
global_income |>
  group_by(year) |>
  summarise(
    mean_income = mean(average_income_USD),
    median_income = median(average_income_USD)
  ) |>
  pivot_longer(cols = -year, names_to = "type", values_to = "value") |>
  hchart(type = "line", hcaes(x = year, y = value, group = type))
# Mean Gini index per year for the "High Income" and "Low Income" groups.
# .groups = "drop" returns an ungrouped summary, matching the region charts
# further down, and silences the "grouped output by 'year'" message that
# summarize() emitted before.
global_income |>
  group_by(year, income_group) |>
  summarize(mean_income = mean(average_income_USD),
            mean_gini = mean(gini_index), .groups = "drop") |>
  filter(income_group %in% c("High Income", "Low Income")) |>
  hchart(type = "line", hcaes(x = year, y = mean_gini, group = income_group))
# Column chart of the United States' Gini index by year. Each bar is
# coloured green ("#34A853") when the index is below 0.3 and red
# ("#EA4335") otherwise.
global_income |>
  filter(country == "United States") |>
  mutate(g_color = if_else(gini_index < 0.3, "#34A853", "#EA4335")) |>
  hchart(
    type = "column",
    hcaes(x = year, y = gini_index, color = g_color)
  )
# Mean average income per year for the "NA" and "LATAM" regions.
# Here "NA" is a literal region code string, not a missing value.
# Filtering on the grouping column first yields the same summary rows.
global_income |>
  filter(region %in% c("NA", "LATAM")) |>
  group_by(year, region) |>
  summarise(
    mean_income = mean(average_income_USD),
    mean_gini = mean(gini_index),
    .groups = "drop"
  ) |>
  hchart(type = "line", hcaes(x = year, y = mean_income, group = region))
# Mean average income per year for the "EMEA" and "APAC" regions.
# Filtering on the grouping column first yields the same summary rows.
global_income |>
  filter(region %in% c("EMEA", "APAC")) |>
  group_by(year, region) |>
  summarise(
    mean_income = mean(average_income_USD),
    mean_gini = mean(gini_index),
    .groups = "drop"
  ) |>
  hchart(type = "line", hcaes(x = year, y = mean_income, group = region))